1 Executive Summary

1.0.1 Check for outliers in saleprice, and complete one-hot-encoding. Then delete the variables that were one-hot encoded

2 Introduction

2.1 Competition description

The description is at this link - https://www.kaggle.com/c/house-prices-advanced-regression-techniques

3 Exploratory Data Analysis

3.1 Loading required libraries

Loading required libraries into R workspace

library(knitr)
library(DataExplorer) #For data exploration 
library(DescTools)
library(psych)  # describe()
library(plyr)
library(rowr)
library(corrplot) # For graphical correlations
library(caret)
library(Hmisc) # For correlation matrix 
library(rlist)
library(randomForest)
library(e1071) # tune() for random forests

3.2 Reading CSV’s

Reading the csv’s as dataframes into R.

setwd("/Users/chetanak/Box Sync/Projects/DataScience/R/house-prices-advanced-regression-techniques/")
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)

3.3 Data Description & Structure

The test and train datasets have 1460 and 1459 rows each respectively. The test and train datasets have 81 and 80 columns each respectively. A difference in the columns variables across two data sets reveal absence of ‘SalePrice’ predictor variable in test data. An ‘ID’ column variable is present in each of the datasets. However, this column is not essential in prediction rather this column in test data set is required for submission. Hence, saving the ‘test$ID’ values to a vector ‘test_ids’ and eliminating the ‘ID’ column in both the datasets. We now bind the datasets for exploratory analysis. After binding, the combined data set contains 2919 and 80 columns. Of these 80, 79 are predictor variables and one is a response variable, which is ‘SalePrice’

  • The data structure shows both character and integer variables with null values on a few of those variables
  • The most of the character variables are ordinal factors, but I read them as strings for data cleaning. Post cleaning, we will see if converting any or all of these factors to integer is beneficial
dim(train)
[1] 1460   81
dim(test)
[1] 1459   80
setdiff(names(train), names(test))
[1] "SalePrice"
test_ids <- test$Id
train$Id <- NULL
test$Id <- NULL
test$SalePrice <- NA
df <- rbind(train, test)
dim(df)
[1] 2919   80
str(df, list.len=10)
'data.frame':   2919 obs. of  80 variables:
 $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
 $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
 $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
 $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
 $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
 $ Alley        : chr  NA NA NA NA ...
 $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
 $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
 $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
 $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
  [list output truncated]

3.4 Handling Missing Data, Factoring, Encoding

3.4.1 Prep Data Handling

The best way to analyze each varaible is by grouping them under their respective category. After grouping the variabels I am going to,

  1. Handle missing data
  2. Factorize the required variables
  3. Encode variables as needed

Subsequent tabs above are going detail the steps taken to handle missing data, factorizing, encoding of the variables. Below is the variable grouping and the missing counts in each variable

garage_vars <- names(df[which(names(df) %like% "%Garage%")])
basement_vars <- names(df[which(names(df) %like% "%Bsmt%")])
pool_vars <- names(df[which(names(df) %like% "%Pool%")])

porch_vars <- names(df[which(names(df) %like% c("%Porch%", "%Deck%"))])

sale_vars <- names(df[which(names(df) %like% c("%Sale%", "%Sold"))])

lot_vars <- names(df[which(names(df) %like% c("%Lot%", "%Land%"))])

dwelling_vars <- names(df[which(names(df) %like% c("%SubClass%", "%Bldg%", "%HouseStyle%", "%Overall%", "%Year%"))])

exterior_vars <- names(df[which(names(df) %like% c("%Exter%", "%Roof%", "%MasVnr%", "%Foundation%", "%Street%", "%Alley%", "%PavedDrive%", "%Fence%"))])

utility_vars <- names(df[which(names(df) %like% c("%Heat%", "%Utilities%", "%Central%", "%Electrical%"))])

interior_vars <- names(df[which(names(df) %like% c("%Room%", "FullBath%", "HalfBath%", "%Kitchen%", "%Fire%", "%AbvGr%", "%Functional%", "%FlrSF%", "%LowQualFinSF%", "%GrLivArea%"))])

misc_vars <- names(df[which(names(df) %like% c("%Misc%"))])

zoning_vars <- names(df[which(names(df) %like% c("%Zoning%"))])

neighborhood_vars <- names(df[which(names(df) %like% c("%Neighborhood%", "Condition%"))])


vars_list = list(garage_vars, basement_vars, pool_vars, porch_vars, sale_vars, lot_vars, 
                 dwelling_vars, exterior_vars, utility_vars, interior_vars, misc_vars, 
                 zoning_vars, neighborhood_vars)

df_var <- as.data.frame(do.call(cbind.fill, c(vars_list, fill = NA)))
colnames(df_var) <-  c("GarageVars", "BasementVars", "PoolVars", "PorchVars", "SaleVars", "LotVars",
                        "DwellingVars", "ExteriorVars", "UtilityVars", "InteriorVars", "MiscVars", 
                        "ZomingVars", "NeighborhoodVars")
df_var
#Show the  missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA
      PoolQC  MiscFeature        Alley        Fence    SalePrice 
        2909         2814         2721         2348         1459 
 FireplaceQu  LotFrontage  GarageYrBlt GarageFinish   GarageQual 
        1420          486          159          159          159 
  GarageCond   GarageType     BsmtCond BsmtExposure     BsmtQual 
         159          157           82           82           81 
BsmtFinType2 BsmtFinType1   MasVnrType   MasVnrArea     MSZoning 
          80           79           24           23            4 
   Utilities BsmtFullBath BsmtHalfBath   Functional  Exterior1st 
           2            2            2            2            1 
 Exterior2nd   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF  TotalBsmtSF 
           1            1            1            1            1 
  Electrical  KitchenQual   GarageCars   GarageArea     SaleType 
           1            1            1            1            1 

3.4.2 Garage Variables

There are 7 Garage variables, out of which GarageYrBlt, GarageFinish, GarageQual, GarageCond have 159 NA’s, GarageType has 157 NA’s, GarageCars and Garage Area have 1 NA each.

df_NA[c(which(names(df_NA) %like% "Garage%"))]
 GarageYrBlt GarageFinish   GarageQual   GarageCond   GarageType 
         159          159          159          159          157 
  GarageCars   GarageArea 
           1            1 
cbind(sapply(df[garage_vars], class))
             [,1]       
GarageType   "character"
GarageYrBlt  "integer"  
GarageFinish "character"
GarageCars   "integer"  
GarageArea   "integer"  
GarageQual   "character"
GarageCond   "character"

First we need to check which of the 159 NA’s in the variables are common with 157 NA’s in GarageType


length(which(is.na(df$GarageYrBlt) & is.na(df$GarageFinish) & is.na(df$GarageQual) & is.na(df$GarageCond)  & is.na(df$GarageType)))
## [1] 157

There are 157 rows that are common across GarageYrBlt, GarageFinish, GarageQual, GarageCond, GarageType Let’s fix the values for these variables by selecting the rows based on GarageType NA’s since we know for sure that GarageType NA’s are in the 159 NA’s for other Garage variables. We assign the value ‘None’ to all NA’s based on the variable description in the previous tab.

df$GarageFinish[is.na(df$GarageType)] <- 'None'
df$GarageQual[is.na(df$GarageType)] <- 'None'
df$GarageCond[is.na(df$GarageType)] <- 'None'
df$GarageYrBlt[is.na(df$GarageType)] <- 0   # Integer Variable
df$GarageType[is.na(df$GarageType)] <- 'None'

#Show the  missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "Garage%"))]
##  GarageYrBlt GarageFinish   GarageQual   GarageCond   GarageCars 
##            2            2            2            2            1 
##   GarageArea 
##            1

Now we have 2 NA’s in each GarageYrBlt, GarageFinish, GarageQual, GarageCond variables and 1 NA each in GarageCars and GarageArea

kable(df[which(is.na(df$GarageYrBlt) | is.na(df$GarageFinish) | is.na(df$GarageQual) | is.na(df$GarageCond)), c("GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "GarageCars", "GarageArea")])
GarageYrBlt GarageFinish GarageQual GarageCond GarageCars GarageArea
2127 NA NA NA NA 1 360
2577 NA NA NA NA NA NA

Is is evident that row 2127 has a Garage since the GarageArea is 360 and row 2577 has none. Fixing row 2577 values as below by assigning 0 value to integer variables and ‘None’ to character variables

df$GarageYrBlt[2577] <- 0
df$GarageFinish[2577] <- 'None'
df$GarageQual[2577] <- 'None'
df$GarageCond[2577] <- 'None'
df$GarageCars[2577] <- 0
df$GarageArea[2577] <- 0
df[2577, c("GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "GarageCars", "GarageArea")]

Let us fix row 2127 for these variables. For GarageYrBlt variable, I will impute the YearBuild value. For other values I will impute with mode (the most frequent value) in the respective column

df$GarageYrBlt[2127] <- df$YearBuilt[2127]
df$GarageFinish[2127] <- names(sort(table(df$GarageFinish), decreasing=TRUE)[1])
df$GarageQual[2127] <- names(sort(table(df$GarageQual), decreasing=TRUE)[1])
df$GarageCond[2127] <- names(sort(table(df$GarageCond), decreasing=TRUE)[1])
df[2127, c("GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "GarageCars", "GarageArea")]

#Show the  missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "Garage%"))]
## named numeric(0)

Now, that all the NA’s are handled, I am going to evaluate the character variables. We have 4 character variables. Of these there are 3 ordered/ordinal variables (GarageFinish, GarageQual, GarageCond) and 1 unordered/multinomial (GarageType).

table(df$GarageType)

 2Types  Attchd Basment BuiltIn CarPort  Detchd    None 
     23    1723      36     186      15     779     157 
table(df$GarageFinish)

 Fin None  RFn  Unf 
 719  158  811 1231 
table(df$GarageQual)

  Ex   Fa   Gd None   Po   TA 
   3  124   24  158    5 2605 
table(df$GarageCond)

  Ex   Fa   Gd None   Po   TA 
   3   74   15  158   14 2655 

I will convert GarageType to a factor since it does not have specific ordering to its values. I will assign numerical values to other character variables and convert them to Numeric type. I will decide later if I should treat ordinal variables as numeric or convert them to factors

df$GarageType <- factor(df$GarageType)
df$GarageFinish <- as.numeric(revalue(df$GarageFinish, c("None"=0, "Unf"=1, "RFn"=2, "Fin"=3)))
df$GarageQual <- as.numeric(revalue(df$GarageQual, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4, "Ex"=5)))
df$GarageCond <- as.numeric(revalue(df$GarageCond, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4, "Ex"=5)))
table(df$GarageType)

 2Types  Attchd Basment BuiltIn CarPort  Detchd    None 
     23    1723      36     186      15     779     157 
table(df$GarageFinish)

   0    1    2    3 
 158 1231  811  719 
table(df$GarageQual)

   0    1    2    3    4    5 
 158    5  124 2605   24    3 
table(df$GarageCond)

   0    1    2    3    4    5 
 158   14   74 2655   15    3 

3.4.3 Basement Variables

There are 11 Basement variables and all of them have NA’s.

df_NA[c(which(names(df_NA) %like% "%Bsmt%"))]
    BsmtCond BsmtExposure     BsmtQual BsmtFinType2 BsmtFinType1 
          82           82           81           80           79 
BsmtFullBath BsmtHalfBath   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF 
           2            2            1            1            1 
 TotalBsmtSF 
           1 

Among these variables I am going to figure out which of the variables BsmtCond, BsmtExposure, BsmtQual, BsmtFinType2 have commond NA row values with BsmtFinType1

length(which(is.na(df$BsmtCond) & is.na(df$BsmtExposure) & is.na(df$BsmtQual) & is.na(df$BsmtFinType2)  & is.na(df$BsmtFinType1)))
[1] 79

There are 79 rows that common across BsmtCond, BsmtExposure, BsmtQual, BsmtFinType2 and BsmtFinType1. Let’s fix the values for these variables by selecting the rows based on BsmtFinType1 NA’s since we know for sure that BsmtFinType1 NA’s are in the 82,81,80 NA’s for other Basement variables. We assign the value ‘None’ to all NA’s based on the variable description.

df$BsmtCond[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtExposure[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtQual[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtFinType2[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtFinType1[is.na(df$BsmtFinType1)] <- 'None'

#Show the  missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Bsmt%"))]
    BsmtCond BsmtExposure     BsmtQual BsmtFullBath BsmtHalfBath 
           3            3            2            2            2 
  BsmtFinSF1 BsmtFinType2   BsmtFinSF2    BsmtUnfSF  TotalBsmtSF 
           1            1            1            1            1 

Now we have 3 NA’s in each BsmtCond, BsmtExposure, 2 NA’s each in BsmtQual, BsmtFullBath BsmtHalfBath and 1 NA each in BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, and TotalBsmtSF

kable(df[which(is.na(df$BsmtCond) | is.na(df$BsmtExposure) | is.na(df$BsmtQual) | is.na(df$BsmtFullBath) | is.na(df$BsmtHalfBath) | is.na(df$BsmtFinSF1) | is.na(df$BsmtFinType2) | is.na(df$BsmtFinSF2) | is.na(df$BsmtUnfSF) | is.na(df$TotalBsmtSF)), c("BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFullBath", "BsmtHalfBath","BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF")])
BsmtCond BsmtExposure BsmtQual BsmtFullBath BsmtHalfBath BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
333 TA No Gd 1 0 1124 NA 479 1603 3206
949 TA NA Gd 0 0 0 Unf 0 936 936
1488 TA NA Gd 0 0 0 Unf 0 1595 1595
2041 NA Mn Gd 1 0 1044 Rec 382 0 1426
2121 None None None NA NA NA None NA NA NA
2186 NA No TA 0 1 1033 Unf 0 94 1127
2189 None None None NA NA 0 None 0 0 0
2218 Fa No NA 0 0 0 Unf 0 173 173
2219 TA No NA 0 0 0 Unf 0 356 356
2349 TA NA Gd 0 0 0 Unf 0 725 725
2525 NA Av TA 0 0 755 Unf 0 240 995

The output above indicates,

  • Row 333 has 1 NA value for BsmtFinType2. I will impute it with a mode value
  • Rows 949, 1488, 2349 have 1 NA each for BsmtExposure. I will impute with mode value
  • Rows 2041, 2186, 2525 have 1 NA each for BsmtCond. I will impute with mode value
  • Rows 2121 and 2189 have no basement and will impute the integer variables with 0 for BsmtFullBath, BsmtHalfBath,BsmtFinSF1,BsmtFinSF2, BsmtUnfSF, TotalBsmtSF
  • Rows 2218, 2219 have 1 NA each for BsmtQual. I will impute with mode value
df$BsmtFinType2[333] <- names(sort(table(df$BsmtFinType2), decreasing=TRUE))[1]
df$BsmtExposure[c(949,1488,2349)] <- names(sort(table(df$BsmtExposure), decreasing=TRUE))[1]
df$BsmtCond[c(2041,2186,2525)] <- names(sort(table(df$BsmtCond), decreasing=TRUE))[1]
df$BsmtFullBath[c(2121,2189)] <- 0
df$BsmtHalfBath[c(2121,2189)] <- 0
df[2121, c("BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF")] <- 0
df$BsmtQual[c(2218,2219)] <- names(sort(table(df$BsmtQual), decreasing=TRUE))[1]

Test if the NA’s are handled

kable(df[c(333,949,1488,2041,2121,2186,2189,2218,2219,2349,2525), c("BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFullBath", "BsmtHalfBath","BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF")])
BsmtCond BsmtExposure BsmtQual BsmtFullBath BsmtHalfBath BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
333 TA No Gd 1 0 1124 Unf 479 1603 3206
949 TA No Gd 0 0 0 Unf 0 936 936
1488 TA No Gd 0 0 0 Unf 0 1595 1595
2041 TA Mn Gd 1 0 1044 Rec 382 0 1426
2121 None None None 0 0 0 None 0 0 0
2186 TA No TA 0 1 1033 Unf 0 94 1127
2189 None None None 0 0 0 None 0 0 0
2218 Fa No TA 0 0 0 Unf 0 173 173
2219 TA No TA 0 0 0 Unf 0 356 356
2349 TA No Gd 0 0 0 Unf 0 725 725
2525 TA Av TA 0 0 755 Unf 0 240 995

There are 5 character Basement variables, 3 of which are ordered (BsmtQual, BsmtCond, BsmtExposure) and 2 of which are unordered (BsmtFinType1, BsmtFinType2). I will convert the unordered variables to factors and assign numeric values to ordered varaibles and convert them to numeric types.

cbind(sapply(df[basement_vars], class))
             [,1]       
BsmtQual     "character"
BsmtCond     "character"
BsmtExposure "character"
BsmtFinType1 "character"
BsmtFinSF1   "numeric"  
BsmtFinType2 "character"
BsmtFinSF2   "numeric"  
BsmtUnfSF    "numeric"  
TotalBsmtSF  "numeric"  
BsmtFullBath "numeric"  
BsmtHalfBath "numeric"  
table(df$BsmtQual)

  Ex   Fa   Gd None   TA 
 258   88 1209   79 1285 
table(df$BsmtCond)

  Fa   Gd None   Po   TA 
 104  122   79    5 2609 
table(df$BsmtExposure)

  Av   Gd   Mn   No None 
 418  276  239 1907   79 
table(df$BsmtFinType1)

 ALQ  BLQ  GLQ  LwQ None  Rec  Unf 
 429  269  849  154   79  288  851 
table(df$BsmtFinType2)

 ALQ  BLQ  GLQ  LwQ None  Rec  Unf 
  52   68   34   87   79  105 2494 
df$BsmtFinType1 <- as.factor(df$BsmtFinType1)
df$BsmtFinType2 <- as.factor(df$BsmtFinType2)
df$BsmtQual <-  as.integer(revalue(df$BsmtQual, c("None"=0, "Fa"=1, "TA"=2, "Gd"=3, "Ex"=4)))
df$BsmtCond <-  as.integer(revalue(df$BsmtCond, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4)))
df$BsmtExposure <-  as.integer(revalue(df$BsmtExposure, c("None"=0, "No"=1, "Mn"=2, "Av"=3, "Gd"=4)))
table(df$BsmtQual)

   0    1    2    3    4 
  79   88 1285 1209  258 
table(df$BsmtCond)

   0    1    2    3    4 
  79    5  104 2609  122 
table(df$BsmtExposure)

   0    1    2    3    4 
  79 1907  239  418  276 
table(df$BsmtFinType1)

 ALQ  BLQ  GLQ  LwQ None  Rec  Unf 
 429  269  849  154   79  288  851 
table(df$BsmtFinType2)

 ALQ  BLQ  GLQ  LwQ None  Rec  Unf 
  52   68   34   87   79  105 2494 

3.4.4 Pool Variables

There are 2 pool variables and only variable PoolQC have 2909 NA’s.

df_NA[c(which(names(df_NA) %like% pool_vars))]
PoolQC 
  2909 

For variable PoolQC, based on the description I am going to replace NA’s with None if the PoolArea is 0. It appears that there are 2906 PoolQA NA’s since there are 2906 PoolArea values with 0.

df$PoolQC[df$PoolArea==0]  <- 'None'
#Show the  missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Pool%"))]
PoolQC 
     3 
df[(is.na(df$PoolQC)), c("PoolQC", "PoolArea")]

We need to find the 3 missing PoolQC NA’s and their respective PoolArea values. Rows 2421, 2504, 2600 have PoolArea values but the PoolQC is missing. The NA values will be imputed with the mode

df$PoolQC[c(2421,2504,2600)] <- names(sort(table(df$PoolQC), decreasing=TRUE))[2]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Pool%"))]
named numeric(0)
df$PoolQC[c(2421,2504,2600)]
[1] "Ex" "Ex" "Ex"

There is 1 Pool ordinal character variable and I will assign numeric values and convert it to integer type.

cbind(sapply(df[pool_vars], class))
         [,1]       
PoolArea "integer"  
PoolQC   "character"
table(df$PoolQC)

  Ex   Fa   Gd None 
   7    2    4 2906 
df$PoolQC <-  as.integer(revalue(df$PoolQC, c("None"=0, "Fa"=1, "Gd"=2, "Ex"=3)))
table(df$PoolQC)

   0    1    2    3 
2906    2    4    7 

3.4.5 Porch Variables

There are 5 porch variables and no NA’s

WoodDeckSF: Wood deck area in square feet

OpenPorchSF: Open porch area in square feet

EnclosedPorch: Enclosed porch area in square feet

3SsnPorch: Three season porch area in square feet

ScreenPorch: Screen porch area in square feet

There are 5 integer porch variables and no NA’s.

df_NA[c(which(names(df_NA) %like% porch_vars))]
named numeric(0)
cbind(sapply(df[porch_vars], class))
              [,1]     
WoodDeckSF    "integer"
OpenPorchSF   "integer"
EnclosedPorch "integer"
X3SsnPorch    "integer"
ScreenPorch   "integer"

3.4.6 Sale Variables

There are 4 sale variables out of which SalePrice and SaleType have NA’s. The 1459 NA’s in SalePrice are due to the addition of missing ‘SalePrice’ column with NA’s assigned in test data that we need to predict.

df_NA[c(which(names(df_NA) %like% sale_vars))]
SalePrice  SaleType 
     1459         1 

For SaleType ‘NA’ values, I will impute with the mode value. There are 2 character variables, ‘SaleType’ and ‘SaleCondition’. Both the character variables are unordered and will convert them to factors

df$SaleType[is.na(df$SaleType)] <- names(sort(table(df$SaleType), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Sale%"))]
SalePrice 
     1459 
cbind(sapply(df[sale_vars], class))
              [,1]       
MoSold        "integer"  
YrSold        "integer"  
SaleType      "character"
SaleCondition "character"
SalePrice     "integer"  
table(df$SaleType)

  COD   Con ConLD ConLI ConLw   CWD   New   Oth    WD 
   87     5    26     9     8    12   239     7  2526 
table(df$SaleCondition)

Abnorml AdjLand  Alloca  Family  Normal Partial 
    190      12      24      46    2402     245 
df$SaleType <- as.factor(df$SaleType)
df$SaleCondition <- as.factor(df$SaleCondition)

3.4.7 Lot and Land Variables

There are 6 variables related to Lot and Land. Variable ‘LotFrontage’ has 486 NA’s.

df_NA[c(which(names(df_NA) %like% lot_vars))]
LotFrontage 
        486 
cbind(sapply(df[lot_vars], class))
            [,1]       
LotFrontage "integer"  
LotArea     "integer"  
LotShape    "character"
LandContour "character"
LotConfig   "character"
LandSlope   "character"

The NA’s for the LotFrontage will be imputed with median value per Neighborhood variable since the values will be within bounds, while imputing with median can make the data look normally distributed.

lot_agg_median <- aggregate(list("LotFrontage_median"=df$LotFrontage), by=list("Neighborhood"=df$Neighborhood), FUN=median, na.rm=TRUE)
lot_agg_median
barplot(lot_agg_median$LotFrontage, names.arg=lot_agg_median$Neighborhood, main="Median LotFrontage by Neighborhood", xlab="Neighborhoods", ylab="Median Lot Frontage", col="blue")

for (i in 1:nrow(df)){
  if(is.na(df$LotFrontage[i])){
    df$LotFrontage[i] <- as.integer(median(df$LotFrontage[df$Neighborhood==df$Neighborhood[i]], na.rm=TRUE))
  }
}

NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Lot%"))]
named numeric(0)

There are 4 character variables. LotConfig and LandContour variables look like unordered varaibles and will convert them to factors. LotShape and LandSlope have ordinal values hence will assign numeric values and convert them to numeric variables

table(df$LotShape)

 IR1  IR2  IR3  Reg 
 968   76   16 1859 
table(df$LandContour)

 Bnk  HLS  Low  Lvl 
 117  120   60 2622 
table(df$LotConfig)

 Corner CulDSac     FR2     FR3  Inside 
    511     176      85      14    2133 
table(df$LandSlope)

 Gtl  Mod  Sev 
2778  125   16 
df$LotConfig <- as.factor(df$LotConfig)
df$LandContour <- as.factor(df$LandContour)
df$LotShape <- as.numeric(revalue(df$LotShape, c('IR3'=0, 'IR2'=1, 'IR1'=2, 'Reg'=3)))
df$LandSlope <- as.numeric(revalue(df$LandSlope, c('Sev'=0, 'Mod'=1, 'Gtl'=2)))
table(df$LotShape)

   0    1    2    3 
  16   76  968 1859 
table(df$LandContour)

 Bnk  HLS  Low  Lvl 
 117  120   60 2622 
table(df$LotConfig)

 Corner CulDSac     FR2     FR3  Inside 
    511     176      85      14    2133 
table(df$LandSlope)

   0    1    2 
  16  125 2778 

3.4.8 Dwelling Variables

There are 7 Dwelling varaibles with no NA’s. There are 2 character variables that are unordered hence I will convert them to factors. ‘MSSubClass’ is an integer variable that should be converted into a factor. Based on the description, it identifies the type of dwelling involved in the sale. It is coded numerically and is an unordered variable.

NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% dwelling_vars))]
named numeric(0)
cbind(sapply(df[dwelling_vars], class))
             [,1]       
MSSubClass   "integer"  
BldgType     "character"
HouseStyle   "character"
OverallQual  "integer"  
OverallCond  "integer"  
YearBuilt    "integer"  
YearRemodAdd "integer"  
table(df$BldgType)

  1Fam 2fmCon Duplex  Twnhs TwnhsE 
  2425     62    109     96    227 
table(df$HouseStyle)

1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer   SLvl 
   314     19   1471      8     24    872     83    128 
df$BldgType <- as.factor(df$BldgType)
df$HouseStyle <- as.factor(df$HouseStyle)
df$MSSubClass <- as.factor(as.character(df$MSSubClass))
cbind(sapply(df[dwelling_vars], class))
             [,1]     
MSSubClass   "factor" 
BldgType     "factor" 
HouseStyle   "factor" 
OverallQual  "integer"
OverallCond  "integer"
YearBuilt    "integer"
YearRemodAdd "integer"

3.4.9 Exterior Features

There are 13 exterior variables out of which 12 are character variables

# NA's for the exterior variables
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% exterior_vars))]
      Alley       Fence  MasVnrType  MasVnrArea Exterior1st Exterior2nd 
       2721        2348          24          23           1           1 
cbind(sapply(df[exterior_vars], class))
            [,1]       
Street      "character"
Alley       "character"
RoofStyle   "character"
RoofMatl    "character"
Exterior1st "character"
Exterior2nd "character"
MasVnrType  "character"
MasVnrArea  "integer"  
ExterQual   "character"
ExterCond   "character"
Foundation  "character"
PavedDrive  "character"
Fence       "character"

Fixing ‘Alley’ and ‘Fence’ variables

df$Alley[is.na(df$Alley)] <- 'None'
df$Fence[is.na(df$Fence)] <- 'None'
df[is.na(df$MasVnrType), c("MasVnrType", "MasVnrArea")]

It is evident that row 2611 has MasVnrArea and a missing MasVnrType. I am going to impute the ‘NA’ with a mode (excluding ‘None’ if it is the most frequent).

df$MasVnrType[is.na(df$MasVnrArea)] <- 'None'
df$MasVnrArea[is.na(df$MasVnrArea)] <- 0
df[2611, "MasVnrType"] <- names(sort(table(df$MasVnrType), decreasing=TRUE))[2]
df[2611, c("MasVnrType", "MasVnrArea")]
df[which(is.na(df$Exterior1st) | is.na(df$Exterior2nd)), c("Exterior1st", "Exterior2nd")]
df$Exterior1st[is.na(df$Exterior1st)] <- names(sort(-table(df$Exterior1st)))[1]
df$Exterior2nd[is.na(df$Exterior2nd)] <- names(sort(-table(df$Exterior2nd)))[1]

NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% c("%Exter%", "%Roof%", "%MasVnr%", "%Foundation%", "%Street%", "%Alley%", "%PavedDrive%", "%Fence%")))]
named numeric(0)

Variables ‘Street’, ‘Alley’, ‘RoofStyle’, ‘RoofMatl’, ‘Exterior1st’, ‘Exterior2nd’, ‘MasVnrType’, ‘Foundation’ and ‘PavedDrive’, “Fence” are unordered variables and will convert them to factors. Variables ‘ExterQual’, ‘ExterCond’ are ordered variables and assign numeric values to them an convert them to integer variables.

df[,c("Street", "Alley", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "PavedDrive", "Fence")] <- lapply(df[,c("Street", "Alley", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "PavedDrive", "Fence")], factor) 

df$ExterQual <-  as.integer(revalue(df$ExterQual, c("Fa"=0, "TA"=1, "Gd"=2, "Ex"=3)))
df$ExterCond <-  as.integer(revalue(df$ExterCond, c("Po"=0, "Fa"=1, "TA"=2, "Gd"=3, "Ex"=4)))
table(df$ExterQual)

   0    1    2    3 
  35 1798  979  107 
table(df$ExterCond)

   0    1    2    3    4 
   3   67 2538  299   12 
table(df$Foundation)

BrkTil CBlock  PConc   Slab  Stone   Wood 
   311   1235   1308     49     11      5 
table(df$PavedDrive)

   N    P    Y 
 216   62 2641 
table(df$Fence)

GdPrv  GdWo MnPrv  MnWw  None 
  118   112   329    12  2348 
cbind(sapply(df[exterior_vars], class))
            [,1]     
Street      "factor" 
Alley       "factor" 
RoofStyle   "factor" 
RoofMatl    "factor" 
Exterior1st "factor" 
Exterior2nd "factor" 
MasVnrType  "factor" 
MasVnrArea  "numeric"
ExterQual   "integer"
ExterCond   "integer"
Foundation  "factor" 
PavedDrive  "factor" 
Fence       "factor" 

3.4.10 Utility Variables

There are 5 utilities variables. Variable ‘Utilities’ has 2 NAs and ‘Electrical’ has 1 NA. All are character variables

cbind(sapply(df[utility_vars], class))
           [,1]       
Utilities  "character"
Heating    "character"
HeatingQC  "character"
CentralAir "character"
Electrical "character"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% utility_vars))]
 Utilities Electrical 
         2          1 
df[is.na(df$Utilities) | is.na(df$Electrical), c("Utilities", "Electrical")]

Fixing the variables with the mode values.

df[1380, "Electrical"] <- names(sort(table(df$Electrical), decreasing=TRUE))[1]
df[c(1916, 1946), c("Utilities")] <- names(sort(table(df$Utilities), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% utility_vars))]
named numeric(0)
table(df$Utilities)

AllPub NoSeWa 
  2918      1 
table(df$Heating)

Floor  GasA  GasW  Grav  OthW  Wall 
    1  2874    27     9     2     6 
table(df$HeatingQC)

  Ex   Fa   Gd   Po   TA 
1493   92  474    3  857 
table(df$CentralAir)

   N    Y 
 196 2723 
table(df$Electrical)

FuseA FuseF FuseP   Mix SBrkr 
  188    50     8     1  2672 

Variables ‘Utilities’ and ‘Heating’ and ‘Electrical’ are unordered and hence will convert them to factors. Variables ‘HeatingQC’ and ‘CentralAir’ are ordered and will assigned numeric values and convert them to numeric type.

df$Utilities <- as.factor(df$Utilities)
df$Heating <- as.factor(df$Heating)
df$Electrical <- as.factor(df$Electrical)
df$HeatingQC <-  as.integer(revalue(df$HeatingQC, c("Po"=0, "Fa"=1, "TA"=2, "Gd"=3, "Ex"=4)))
df$CentralAir <-  as.integer(revalue(df$CentralAir, c("N"=0, "Y"=1)))
table(df$Utilities)

AllPub NoSeWa 
  2918      1 
table(df$Heating)

Floor  GasA  GasW  Grav  OthW  Wall 
    1  2874    27     9     2     6 
table(df$HeatingQC)

   0    1    2    3    4 
   3   92  857  474 1493 
table(df$CentralAir)

   0    1 
 196 2723 
table(df$Electrical)

FuseA FuseF FuseP   Mix SBrkr 
  188    50     8     1  2672 
cbind(sapply(df[utility_vars], class))
           [,1]     
Utilities  "factor" 
Heating    "factor" 
HeatingQC  "integer"
CentralAir "integer"
Electrical "factor" 

3.4.11 Interior Features

cbind(sapply(df[interior_vars], class))
             [,1]       
X1stFlrSF    "integer"  
X2ndFlrSF    "integer"  
LowQualFinSF "integer"  
GrLivArea    "integer"  
FullBath     "integer"  
HalfBath     "integer"  
BedroomAbvGr "integer"  
KitchenAbvGr "integer"  
KitchenQual  "character"
TotRmsAbvGrd "integer"  
Functional   "character"
Fireplaces   "integer"  
FireplaceQu  "character"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% interior_vars))]
FireplaceQu  Functional KitchenQual 
       1420           2           1 

Handling missing data

df$FireplaceQu[is.na(df$FireplaceQu)] <- 'None'
df[is.na(df$Functional) | is.na(df$KitchenQual), c("Functional", "KitchenQual")] 
df$KitchenQual[is.na(df$KitchenQual)] <- names(sort(table(df$KitchenQual), decreasing=TRUE))[1]
df$Functional[is.na(df$Functional)] <- names(sort(table(df$Functional), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% interior_vars))]
named numeric(0)
table(df$FireplaceQu)

  Ex   Fa   Gd None   Po   TA 
  43   74  744 1420   46  592 
table(df$Functional)

Maj1 Maj2 Min1 Min2  Mod  Sev  Typ 
  19    9   65   70   35    2 2719 
table(df$KitchenQual)

  Ex   Fa   Gd   TA 
 205   70 1151 1493 

All the three character variables are ordinal and will assign integer values and convert them to integer type variables.

df$FireplaceQu <-  as.integer(revalue(df$FireplaceQu, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4, "Ex"=5)))
df$Functional <-  as.integer(revalue(df$Functional, c("Sal"=0, "Sev"=1, "Maj2"=2, "Maj1"=3, "Mod"=4, "Min2"=5, "Min1"=6, "Typ"=7)))
The following `from` values were not present in `x`: Sal
df$KitchenQual <-  as.integer(revalue(df$KitchenQual, c("Fa"=0, "TA"=1, "Gd"=2, "Ex"=3)))
table(df$FireplaceQu)

   0    1    2    3    4    5 
1420   46   74  592  744   43 
table(df$Functional)

   1    2    3    4    5    6    7 
   2    9   19   35   70   65 2719 
table(df$KitchenQual)

   0    1    2    3 
  70 1493 1151  205 

3.4.12 Miscellaneous Variables

There are 2 varaibles and ‘MiscFeature’ has 2814 NA’s

cbind(sapply(df[misc_vars], class))
            [,1]       
MiscFeature "character"
MiscVal     "integer"  
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% c("%Misc%")))]
MiscFeature 
       2814 

I will assign ‘None’ to all the NA’s in this variable and convert it to a factor since there is no order and it is a multinomial variable.

df$MiscFeature[is.na(df$MiscFeature)] <- "None"
table(df$MiscFeature)

Gar2 None Othr Shed TenC 
   5 2814    4   95    1 
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% c("%Misc%")))]
named numeric(0)
df$MiscFeature <- as.factor(df$MiscFeature)
cbind(sapply(df[misc_vars], class))
            [,1]     
MiscFeature "factor" 
MiscVal     "integer"

3.4.13 Zoning Variables

MSZoning: Identifies the general zoning classification of the sale.

   A    Agriculture
   C    Commercial
   FV   Floating Village Residential
   I    Industrial
   RH   Residential High Density
   RL   Residential Low Density
   RP   Residential Low Density Park 
   RM   Residential Medium Density
   
cbind(sapply(df[zoning_vars], class))
         [,1]       
MSZoning "character"
table(df[zoning_vars])

C (all)      FV      RH      RL      RM 
     25     139      26    2265     460 
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% zoning_vars))]
MSZoning 
       4 

There are 4 NA in ‘MSZoning’ variable and I will impute it with mode. The variable is not ordinal and hence will convert it to a factor.

df$MSZoning[is.na(df$MSZoning)] <- names(sort(table(df$MSZoning), decreasing=TRUE))[1]

NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% zoning_vars))]
named numeric(0)
df$MSZoning <- as.factor(df$MSZoning)
cbind(sapply(df[zoning_vars], class))
         [,1]    
MSZoning "factor"

3.4.14 Community and Neighborhood Variables

There are 3 varaibles with no NA’s

cbind(sapply(df[neighborhood_vars], class))
             [,1]       
Neighborhood "character"
Condition1   "character"
Condition2   "character"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% neighborhood_vars))]
named numeric(0)
table(df$Neighborhood)

Blmngtn Blueste  BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert 
     28      10      30     108      44     267     103     194     165 
 IDOTRR MeadowV Mitchel   NAmes NoRidge NPkVill NridgHt  NWAmes OldTown 
     93      37     114     443      71      23     166     131     239 
 Sawyer SawyerW Somerst StoneBr   SWISU  Timber Veenker 
    151     125     182      51      48      72      24 
table(df$Condition1)

Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNe   RRNn 
    92    164   2511     20     39     28     50      6      9 
table(df$Condition2)

Artery  Feedr   Norm   PosA   PosN   RRAe   RRAn   RRNn 
     5     13   2889      4      4      1      1      2 

All the three variables are unordered and will convert them to factors

df$Neighborhood <- as.factor(df$Neighborhood)
df$Condition1 <- as.factor(df$Condition1)
df$Condition2 <- as.factor(df$Condition2)
cbind(sapply(df[neighborhood_vars], class))
             [,1]    
Neighborhood "factor"
Condition1   "factor"
Condition2   "factor"

3.5 Feature Engineering & Feature Elimination

Now that the variable handling is done, I will explore the data further to understand the relationship between variables and their importance as below -

  1. Check linear combination of numeric variables
  2. Skew and Kurtosis of integer variables
  3. check the correlation of the integer variables
  4. check variable importance using random forests
### Group the factor and integer variables
numeric_cols <- unlist(sapply(df, is.numeric))
numeric_col_names <- names(df[,numeric_cols])
factor_cols <- unlist(sapply(df, is.factor))
names(df[,factor_cols])
 [1] "MSSubClass"    "MSZoning"      "Street"        "Alley"        
 [5] "LandContour"   "Utilities"     "LotConfig"     "Neighborhood" 
 [9] "Condition1"    "Condition2"    "BldgType"      "HouseStyle"   
[13] "RoofStyle"     "RoofMatl"      "Exterior1st"   "Exterior2nd"  
[17] "MasVnrType"    "Foundation"    "BsmtFinType1"  "BsmtFinType2" 
[21] "Heating"       "Electrical"    "GarageType"    "PavedDrive"   
[25] "Fence"         "MiscFeature"   "SaleType"      "SaleCondition"


#Find linear combinations and eliminate the combination variables. Do not include SalePrice variable.
findLinearCombos(df[,numeric_col_names[-52]])
$linearCombos
$linearCombos[[1]]
[1] 18 15 16 17

$linearCombos[[2]]
[1] 24 21 22 23


$remove
[1] 18 24

### Skew and Kurtosis of integer variables prior to any transformations, feature engineering or one-hot encoding
as.data.frame(psych::describe(df[,numeric_cols]))[, c("mean", "median", "sd", "skew", "kurtosis")]

findLinearCombos() identified that 1. TotalBsmtSF is a cumlative sum of “BsmtFinSF1” “BsmtFinSF2” “BsmtUnfSF” 2. “GrLivArea” is a cumlative sum of “X1stFlrSF” “X2ndFlrSF” “LowQualFinSF”


### Get a correlation of int variables

#### Plot the correlations of all numeric variables ####
df_corr <- cor(df[,numeric_cols], use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
df_corr_sorted <- as.matrix(sort(df_corr[,'SalePrice'], decreasing = TRUE))
#select only high corelations with Sales Price
df_high_corr <- names(which(apply(df_corr_sorted, 1, function(x) abs(x)>0.4)))
df_corr_matrix <- df_corr[df_high_corr, df_high_corr]
#corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt", upper="circle", lower="number")
corrplot(df_corr_matrix, method="number", tl.col="black", tl.srt=45)

The correlation matrix above indicates the variables that are highly correlated with SalePrice. There is also a high correlation among some predictor variables. Inorder to reduce the effects of multi-collinearity we need to eliminate predictor variables that are highly correlated with other predictor variables. Before eliminating the variables I am going to run a Random Forests model for a test of significance and important variables.

set.seed(100)
#check variable importance using random forests
#rf_ranges <- list(ntree=c(500), mtry=5:30)
#rf_tune <- tune(randomForest, SalePrice ~ ., data=df[1:1460,], ranges=rf_ranges) 
#rf_tune$best.parameters
#rf_best <- rf_tune$best.model
#rf_best
df_rf <- randomForest(x=df[1:1460, -80], y=df$SalePrice[1:1460], ntree=500, importance=TRUE)
varImpPlot(df_rf)

The left plot above indicates the increase in MSE if a variable is taken out of the model. The right plot indicates the node purity, which is a loss function using which the splits are chosen. The loss function is MSE for regression and GINI-impurity for classification.

Now, I will proceed with feature engineering and feature elimination. I will start with each of the variable groups above, run a pair-wise correlation of each group with SalePrice while keeping in view the above correlation matrix and variable importance.

cor(df[,c("SalePrice", "GarageCars", "GarageArea", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond")], use="pairwise.complete.obs")
             SalePrice GarageCars GarageArea GarageYrBlt GarageFinish
SalePrice    1.0000000  0.6404092  0.6234314   0.2613664    0.5492468
GarageCars   0.6404092  1.0000000  0.8898902   0.5806926    0.5771599
GarageArea   0.6234314  0.8898902  1.0000000   0.5502094    0.5120565
GarageYrBlt  0.2613664  0.5806926  0.5502094   1.0000000    0.4830923
GarageFinish 0.5492468  0.5771599  0.5120565   0.4830923    1.0000000
GarageQual   0.2738391  0.5666054  0.5532910   0.9441342    0.4864983
GarageCond   0.2631908  0.5562588  0.5402007   0.9493199    0.4799565
             GarageQual GarageCond
SalePrice     0.2738391  0.2631908
GarageCars    0.5666054  0.5562588
GarageArea    0.5532910  0.5402007
GarageYrBlt   0.9441342  0.9493199
GarageFinish  0.4864983  0.4799565
GarageQual    1.0000000  0.9463743
GarageCond    0.9463743  1.0000000

The count of Garage Cars is directly proportional to the Garage Area. Garage cars has a high correlation with SalePrice than GarageArea. Hence we drop the variable GarageArea. Similarly we drop GarageFinish. For now I am going to create list of variables that will be dropped and append to this list as I identify more.

drop_vars <- list("GarageArea", "GarageFinish")

Predictor variable TotalBsmtSF is the cumulative sum of predictor variables BsmtFinSF1,BsmtFinSF2, BsmtUnfSF Correlation between TotalBsmtSF and the other Bsmt SquareFoot variable is 1. Hence dropping the variables BsmtFinSF1,BsmtFinSF2, BsmtUnfSF.

cor(df$TotalBsmtSF, (df$BsmtFinSF1+df$BsmtFinSF2+df$BsmtUnfSF))
[1] 1
cor(df[,c("SalePrice", "BsmtQual", "BsmtCond", "BsmtExposure", "TotalBsmtSF", "BsmtFullBath")], use="pairwise.complete.obs")
             SalePrice  BsmtQual  BsmtCond BsmtExposure TotalBsmtSF
SalePrice    1.0000000 0.6229247 0.2126072    0.3750450   0.6135806
BsmtQual     0.6229247 1.0000000 0.5359296    0.4386610   0.5653541
BsmtCond     0.2126072 0.5359296 1.0000000    0.2701941   0.3854455
BsmtExposure 0.3750450 0.4386610 0.2701941    1.0000000   0.4081805
TotalBsmtSF  0.6135806 0.5653541 0.3854455    0.4081805   1.0000000
BsmtFullBath 0.2271222 0.2614480 0.1708422    0.3381548   0.3262518
             BsmtFullBath
SalePrice       0.2271222
BsmtQual        0.2614480
BsmtCond        0.1708422
BsmtExposure    0.3381548
TotalBsmtSF     0.3262518
BsmtFullBath    1.0000000
drop_vars <- list.append(drop_vars, "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF")

Decks are usually open and porches either have a roof or are screened.

Hence, I am going to consolidate all the porch variables and delete “OpenPorchSF”, “EnclosedPorch”, “X3SsnPorch”, “ScreenPorch” variables.

df$TotalPorchSF <- df$OpenPorchSF + df$EnclosedPorch + df$X3SsnPorch + df$ScreenPorch
cor(df[,c("SalePrice", "TotalPorchSF", "WoodDeckSF")], use="pairwise.complete.obs")
             SalePrice TotalPorchSF  WoodDeckSF
SalePrice    1.0000000   0.19573894  0.32441344
TotalPorchSF 0.1957389   1.00000000 -0.07489956
WoodDeckSF   0.3244134  -0.07489956  1.00000000
plot(df$TotalPorchSF, df$SalePrice)

drop_vars <- list.append(drop_vars, "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch" )
head(df$SalePrice)
[1] 208500 181500 223500 140000 250000 143000

Adding all the porch variables did not seem to increase the correlation with SalePrice

GrLivArea is a cumulative sum of X1stFlrSF, X2ndFlrSF, LowQualFinSF and have a correlation of 1 with GrLivArea. Dropping X1stFlrSF, X2ndFlrSF, LowQualFinSF inorder to reduce multi-collinearity effects.

I am also going to consolidate the variables GrLivArea and TotalBsmtSF into TotalSF and drop the two variables

cor(df$GrLivArea, (df$X1stFlrSF+df$X2ndFlrSF+df$LowQualFinSF))
[1] 1
df$TotalSF <- df$GrLivArea + df$TotalBsmtSF
cor(df[,c("SalePrice", "GrLivArea", "TotalBsmtSF", "TotalSF")], use="pairwise.complete.obs")
            SalePrice GrLivArea TotalBsmtSF   TotalSF
SalePrice   1.0000000 0.7086245   0.6135806 0.7789588
GrLivArea   0.7086245 1.0000000   0.4457098 0.8717698
TotalBsmtSF 0.6135806 0.4457098   1.0000000 0.8271178
TotalSF     0.7789588 0.8717698   0.8271178 1.0000000
drop_vars <- list.append(drop_vars, "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF", "GrLivArea", "TotalBsmtSF")

I will not touch the Quality variables since buyers depend on each quality variable first before being satisfied with the overall quality of the house.

Next, I am going to consolidate all the bathroom variables since buyers are interested in total bathroom count. I will multiply the half baths with .05 inorder to sum it up correctly. Below correlation clearly indicates that TotalBathRooms variable is a strong predictor than individual predictors. Thus, dropping all the Bath predictor variables and retaining the TotalBathRooms variable.

df$TotalBathRooms <- df$BsmtFullBath +  (df$BsmtHalfBath * 0.5) + df$FullBath + (df$HalfBath * 0.5)
cor(df[,c("SalePrice", "TotalBathRooms", "BsmtFullBath", "BsmtHalfBath",  "FullBath" ,  "HalfBath")], use="pairwise.complete.obs")
                 SalePrice TotalBathRooms BsmtFullBath BsmtHalfBath
SalePrice       1.00000000     0.63173107   0.22712223  -0.01684415
TotalBathRooms  0.63173107     1.00000000   0.60252510   0.00501479
BsmtFullBath    0.22712223     0.60252510   1.00000000  -0.14865481
BsmtHalfBath   -0.01684415     0.00501479  -0.14865481   1.00000000
FullBath        0.56066376     0.71326860  -0.01922449  -0.04711656
HalfBath        0.28410768     0.38848184  -0.03358243  -0.05845680
                  FullBath    HalfBath
SalePrice       0.56066376  0.28410768
TotalBathRooms  0.71326860  0.38848184
BsmtFullBath   -0.01922449 -0.03358243
BsmtHalfBath   -0.04711656 -0.05845680
FullBath        1.00000000  0.15838609
HalfBath        0.15838609  1.00000000
drop_vars <- list.append(drop_vars, "BsmtFullBath", "BsmtHalfBath",  "FullBath" ,  "HalfBath")

I am going to take a look at the Room predictor variables. TotRmsAbvGrd is highly correlated with SalePrice, however, above correlation matrix indicates a high correlation with GrLivArea. Since GrLivArea is highly correlated with SalePrice than TotRmsAbvGrd is with SalePrice, we will drop TotRmsAbvGrd variable.

cor(df[,c("SalePrice", "BedroomAbvGr",  "KitchenAbvGr", "TotRmsAbvGrd")], use="pairwise.complete.obs")
              SalePrice BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
SalePrice     1.0000000    0.1682132   -0.1359074    0.5337232
BedroomAbvGr  0.1682132    1.0000000    0.2411822    0.6697372
KitchenAbvGr -0.1359074    0.2411822    1.0000000    0.2945694
TotRmsAbvGrd  0.5337232    0.6697372    0.2945694    1.0000000
drop_vars <- list.append(drop_vars, "TotRmsAbvGrd")

Since buyers are interested in the age of the home. I am going to created a new variable ‘Age’ and calculate based on the YearBuilt, YrSold and MonthSold variables. I also converted month into year unit by dividing by 12. A plot of Age and SalePrice indicate a negative correlation. Dropping variables “MoSold”, “YrSold”, because they have a low positive correlation with “SalePrice”. Dropping “YearBuilt” since it has a high correlation with SalePrice and Age variables.

df$Age <- round(df$YrSold+df$MoSold/12 - df$YearBuilt,2)
cor(df[, c("SalePrice", "Age", "MoSold", "YrSold", "YearBuilt")],use="pairwise.complete.obs")
            SalePrice         Age      MoSold      YrSold   YearBuilt
SalePrice  1.00000000 -0.52306381  0.04643225 -0.02892259  0.52289733
Age       -0.52306381  1.00000000 -0.01312782  0.05453331 -0.99908182
MoSold     0.04643225 -0.01312782  1.00000000 -0.15389512  0.01393760
YrSold    -0.02892259  0.05453331 -0.15389512  1.00000000 -0.01234419
YearBuilt  0.52289733 -0.99908182  0.01393760 -0.01234419  1.00000000
plot(df$Age, df$SalePrice)

hist(df$Age)

drop_vars <- list.append(drop_vars, "MoSold", "YrSold", "YearBuilt")

Dropping the variables below and getting a list of numeric and character variables

drop_vars <- unlist(drop_vars)
df <- df[!names(df) %in% drop_vars]
numeric_cols <- unlist(sapply(df, is.numeric))
names(df[,numeric_cols])
 [1] "LotFrontage"    "LotArea"        "LotShape"       "LandSlope"     
 [5] "OverallQual"    "OverallCond"    "YearRemodAdd"   "MasVnrArea"    
 [9] "ExterQual"      "ExterCond"      "BsmtQual"       "BsmtCond"      
[13] "BsmtExposure"   "HeatingQC"      "CentralAir"     "BedroomAbvGr"  
[17] "KitchenAbvGr"   "KitchenQual"    "Functional"     "Fireplaces"    
[21] "FireplaceQu"    "GarageYrBlt"    "GarageCars"     "GarageQual"    
[25] "GarageCond"     "WoodDeckSF"     "PoolArea"       "PoolQC"        
[29] "MiscVal"        "SalePrice"      "TotalPorchSF"   "TotalSF"       
[33] "TotalBathRooms" "Age"           
factor_cols <- unlist(sapply(df, is.factor))
names(df[,factor_cols])
 [1] "MSSubClass"    "MSZoning"      "Street"        "Alley"        
 [5] "LandContour"   "Utilities"     "LotConfig"     "Neighborhood" 
 [9] "Condition1"    "Condition2"    "BldgType"      "HouseStyle"   
[13] "RoofStyle"     "RoofMatl"      "Exterior1st"   "Exterior2nd"  
[17] "MasVnrType"    "Foundation"    "BsmtFinType1"  "BsmtFinType2" 
[21] "Heating"       "Electrical"    "GarageType"    "PavedDrive"   
[25] "Fence"         "MiscFeature"   "SaleType"      "SaleCondition"

3.6 Variable Transformation

In the next step, I am going to

  1. Check the skew and kurtosis values of all numeric variables using the describe() function.
  2. Histogram of variables
  3. Transform the variables based on Skew and Kurtosis values
  4. Check Correlations and variable importance again to verify if anything has changed

The numeric columns also contain the ordinal variables. I will ignore the ordinal variables and look at the original numeric variables only.

Excluding the ordinal integer variables (revalued ordinal character variables), I will focus on the following numeric variables -

“LotFrontage”, “LotArea”, “MasVnrArea”, “TotalBsmtSF”, “GrLivArea”, “BedroomAbvGr” “KitchenAbvGr”, “Fireplaces”, “GarageCars”, “WoodDeckSF”, “PoolArea”, “MiscVal”, “SalePrice”, “TotalPorchSF”, “TotalBathRooms”

All the above variables will be transformed using the YeoJohnson method with the exception of SalePrice. YeoJohnson method handles zero’s and negative values. SalePrice will use a logarithmic transformation since submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price.

transform_vars <-  c("LotFrontage", "LotArea", "MasVnrArea", "TotalSF", "BedroomAbvGr",   "KitchenAbvGr", "Fireplaces", "GarageCars", "WoodDeckSF", "PoolArea",  "MiscVal", "TotalPorchSF", "TotalBathRooms", "Age")

as.data.frame(psych::describe(df[transform_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]

#Exploratory data analysis before transformation
EDA(df[transform_vars])


#Transform variables
preprocessParams <- preProcess(df[transform_vars], method=c("YeoJohnson"))
preprocessParams
Created from 2919 samples and 12 variables

Pre-processing:
  - ignored (0)
  - Yeo-Johnson transformation (12)

Lambda estimates for Yeo-Johnson transformation:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.5269 -0.1262  0.2315  0.1254  0.4265  1.2187 
preprocessParams$method
$YeoJohnson
 [1] "LotFrontage"    "LotArea"        "MasVnrArea"     "TotalSF"       
 [5] "BedroomAbvGr"   "KitchenAbvGr"   "Fireplaces"     "GarageCars"    
 [9] "WoodDeckSF"     "TotalPorchSF"   "TotalBathRooms" "Age"           

$ignore
character(0)
transformed_df <- predict(preprocessParams, df)

hist(df$SalePrice)
transformed_df$SalePrice <- log(df$SalePrice)
#head(transformed_df)
as.data.frame(psych::describe(transformed_df[transform_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
#Exploratory data analysis after transformation
EDA(transformed_df[transform_vars])

#Check variable importance post transformation
set.seed(100)
#check variable importance using random forests
#rf_ranges <- list(ntree=c(500), mtry=5:30)
#rf_tune <- tune(randomForest, SalePrice ~ ., data=df[1:1460,], ranges=rf_ranges) 
#rf_tune$best.parameters
#rf_best <- rf_tune$best.model
#rf_best
names(transformed_df)
 [1] "MSSubClass"     "MSZoning"       "LotFrontage"    "LotArea"       
 [5] "Street"         "Alley"          "LotShape"       "LandContour"   
 [9] "Utilities"      "LotConfig"      "LandSlope"      "Neighborhood"  
[13] "Condition1"     "Condition2"     "BldgType"       "HouseStyle"    
[17] "OverallQual"    "OverallCond"    "YearRemodAdd"   "RoofStyle"     
[21] "RoofMatl"       "Exterior1st"    "Exterior2nd"    "MasVnrType"    
[25] "MasVnrArea"     "ExterQual"      "ExterCond"      "Foundation"    
[29] "BsmtQual"       "BsmtCond"       "BsmtExposure"   "BsmtFinType1"  
[33] "BsmtFinType2"   "Heating"        "HeatingQC"      "CentralAir"    
[37] "Electrical"     "BedroomAbvGr"   "KitchenAbvGr"   "KitchenQual"   
[41] "Functional"     "Fireplaces"     "FireplaceQu"    "GarageType"    
[45] "GarageYrBlt"    "GarageCars"     "GarageQual"     "GarageCond"    
[49] "PavedDrive"     "WoodDeckSF"     "PoolArea"       "PoolQC"        
[53] "Fence"          "MiscFeature"    "MiscVal"        "SaleType"      
[57] "SaleCondition"  "SalePrice"      "TotalPorchSF"   "TotalSF"       
[61] "TotalBathRooms" "Age"           
trans_df_rf <- randomForest(x=transformed_df[1:1460, -63], y=transformed_df$SalePrice[1:1460], ntree=500, importance=TRUE)
varImpPlot(trans_df_rf)

trans_df_corr <- round(cor(transformed_df[c(transform_vars, "SalePrice")],use="pairwise.complete.obs"),2)
trans_cor_sorted <- as.matrix(sort(trans_df_corr[,'SalePrice'], decreasing = TRUE))
trans_cor_vars <- names(apply(trans_cor_sorted, 1, function(x) abs(x)>0))
corrplot(trans_df_corr[trans_cor_vars,trans_cor_vars], method="number", type="upper", tl.col="black", tl.srt=45)

#Scatter plot matrix of variables
pairs(~SalePrice+Age+LotFrontage,data=transformed_df, 
   main="Simple Scatterplot Matrix")

pairs(~SalePrice+LotArea+MasVnrArea,data=transformed_df, 
   main="Simple Scatterplot Matrix")

pairs(~SalePrice+TotalSF+BedroomAbvGr,data=transformed_df, 
   main="Simple Scatterplot Matrix")

pairs(~SalePrice+KitchenAbvGr+Fireplaces+GarageCars,data=transformed_df, 
   main="Simple Scatterplot Matrix")

pairs(~SalePrice+WoodDeckSF+PoolArea,data=transformed_df, 
   main="Simple Scatterplot Matrix")

pairs(~SalePrice+MiscVal+TotalPorchSF+TotalBathRooms,data=transformed_df, 
   main="Simple Scatterplot Matrix")

3.7 Graphical Analysis

Post transformation I will continue to work with ‘transfomed_df’ dataset. Now that the numeric variables are transformed. I will encode the nominal variables, in other words the factor variables. Since our dependant variable, SalePrice is an integer and the goal is to predict the SalePrice, we are going to run a regression model for which categorical data for nominal variables must be converted to numeric form. I am going to use the One-Hot encoding.

The nominal variables are as below -

factor_cols <- unlist(sapply(transformed_df, is.factor))
names(transformed_df[,factor_cols])
 [1] "MSSubClass"    "MSZoning"      "Street"        "Alley"        
 [5] "LandContour"   "Utilities"     "LotConfig"     "Neighborhood" 
 [9] "Condition1"    "Condition2"    "BldgType"      "HouseStyle"   
[13] "RoofStyle"     "RoofMatl"      "Exterior1st"   "Exterior2nd"  
[17] "MasVnrType"    "Foundation"    "BsmtFinType1"  "BsmtFinType2" 
[21] "Heating"       "Electrical"    "GarageType"    "PavedDrive"   
[25] "Fence"         "MiscFeature"   "SaleType"      "SaleCondition"

3.7.1 Garage Variables

## Quick summary ##
#List of garage variables after feature engineering
garage_vars <- setdiff(garage_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[garage_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[garage_vars])
EDA(transformed_df[garage_vars])

3.7.2 Basement Variables

 ## Quick summary ##
basement_vars <- setdiff(basement_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[basement_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[basement_vars])
EDA(transformed_df[basement_vars])

3.7.3 Pool Variables

 ## Quick summary ##
pool_vars <- setdiff(pool_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[pool_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[pool_vars])
EDA(transformed_df[pool_vars])

3.7.4 Porch Variables

 ## Quick summary ##
porch_vars <- setdiff(porch_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[porch_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[porch_vars])
EDA(transformed_df[porch_vars])

3.7.5 Sale Variables

 ## Quick summary ##
sale_vars <- setdiff(sale_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[sale_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[sale_vars])
EDA(transformed_df[sale_vars])

3.7.6 Lot and Land Variables

## Quick summary ##
lot_vars <- setdiff(lot_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[lot_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[lot_vars])
EDA(transformed_df[lot_vars])

3.7.7 Dwelling Variables

## Quick summary ##
dwelling_vars <- setdiff(dwelling_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[dwelling_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[dwelling_vars])
EDA(transformed_df[dwelling_vars])

3.7.8 Exterior Features

 ## Quick summary ##
exterior_vars <- setdiff(exterior_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[exterior_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[exterior_vars])
EDA(transformed_df[exterior_vars])

3.7.9 Utility Variables

## Quick summary ##
utility_vars <- setdiff(utility_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[utility_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[utility_vars])
EDA(transformed_df[utility_vars])

3.7.10 Interior Features

 ## Quick summary ##
interior_vars <- setdiff(interior_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[interior_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[interior_vars])
EDA(transformed_df[interior_vars])

3.7.11 Miscellaneous Variables

mmisc_vars <- setdiff(misc_vars, drop_vars)
## Quick summary ##
as.data.frame(psych::describe(transformed_df[misc_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[misc_vars])
EDA(transformed_df[misc_vars])

3.7.12 Zoning Variables

## Quick summary ##
zoning_vars <- setdiff(zoning_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[zoning_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
Warning in psych::describe(transformed_df[zoning_vars]): You were trying
to describe a non-numeric data.frame or vector which describe converted to
numeric.
head(transformed_df[zoning_vars])
EDA(transformed_df[zoning_vars])

3.7.13 Community and Neighborhood Variables

## Quick summary ##
neighborhood_vars <- setdiff(neighborhood_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[neighborhood_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
head(transformed_df[neighborhood_vars])
EDA(transformed_df[neighborhood_vars])

##From CARET package
#fit_glm <= glm(transformed_df~., family="")
#varImp(fit_glm)

#A correaltion between the garage variables and
### Get a historgram of factor variables
### Identify most impact variables